import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Load the raw COMAP roller-coaster workbook into a DataFrame
data_raw = pd.read_excel('COMAP_RollerCoasterData_2018.xlsx')
D:\software_install\anaconda\lib\site-packages\openpyxl\worksheet\_reader.py:300: UserWarning: Unknown extension is not supported and will be removed warn(msg)
data_raw.tail(10)
Name | Park | City/Region | City/State/Region | Country/Region | Geographic Region | Construction | Type | Status | Year/Date Opened | Height (feet) | Speed (mph) | Length (feet) | Inversions (YES or NO) | Number of Inversions | Drop (feet) | Duration (min:sec) | G Force | Vertical Angle (degrees) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
295 | Wildfire | Kolmarden | Norrkoping | Ostergotland | Sweden | Europe | Wood | Sit Down | Operating | 2016.0 | 183.8 | 71.500000 | 4150.300000 | YES | 3.0 | 160.8 | 02:00:00 | 4 | 83.0 |
296 | Winjas | Phantasialand | Bruhl | North Rhine-Westphalia | Germany | Europe | Steel | Sit Down | Operating | 2002.0 | 57.1 | 41.000000 | 1526.600000 | NO | 0.0 | 52.5 | NaN | NaN | NaN |
297 | Wodan Timbur Coaster | Europa Park | Rust | Baden Wuerttemberg | Germany | Europe | Wood | Sit Down | Operating | 2012.0 | 131.3 | 62.100000 | 3444.900000 | NO | 0.0 | NaN | 03:25:00 | 3.5 | NaN |
298 | X2 | Six Flags Magic Mountain | Valencia | California | United States | North America | Steel | Wing | Operating | 2002.0 | 175 | 76.000000 | 3610.000000 | YES | 2.0 | 215 | NaN | 4 | 88.5 |
299 | Xcelerator | Knott's Berry Farm | Buena Park | California | United States | North America | Steel | Sit Down | Operating | 2002.0 | 145.26957 | 63.217729 | 3278.562035 | NO | 0.0 | 01:02:00 | NaN | 90.0 | |
300 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
301 | Primary Data Sources: | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
302 | https://rcdb.com/ | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
303 | https://www.ultimaterollercoaster.com/ | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
304 | https://coasterpedia.net/ | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Drop everything from row 300 onward: those rows are a blank line plus the
# "Primary Data Sources" footnotes, not coaster records.
# .copy() detaches the result from the original frame so the column
# assignments that follow modify an independent DataFrame rather than a slice.
data_raw = data_raw.iloc[:300,:].copy()
data_raw.tail(10)
Name | Park | City/Region | City/State/Region | Country/Region | Geographic Region | Construction | Type | Status | Year/Date Opened | Height (feet) | Speed (mph) | Length (feet) | Inversions (YES or NO) | Number of Inversions | Drop (feet) | Duration (min:sec) | G Force | Vertical Angle (degrees) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
290 | Wicked Cyclone | Six Flags New England | Agawam | Massachusetts | United States | North America | Steel | Sit Down | Operating | 2015.0 | 109 | 55.000000 | 3320.000000 | YES | 3.0 | NaN | NaN | NaN | 78.0 |
291 | Wicked Twister | Cedar Point | Sandusky | Ohio | United States | North America | Steel | Inverted | Operating | 2002.0 | 215 | 72.000000 | 215.000000 | NO | 0.0 | 206 | NaN | NaN | 90.0 |
292 | Wild One | Six Flags America | Upper Marlboro | Maryland | United States | North America | Wood | Sit Down | Operating | 1986.0 | 98 | 53.000000 | 4000.000000 | NO | 0.0 | 88 | 01:52:00 | NaN | NaN |
293 | Wild Thing | Valleyfair! | Shakopee | Minnesota | United States | North America | Steel | Sit Down | Operating | 1996.0 | 207 | 74.000000 | 5460.000000 | NO | 0.0 | 196 | 03:00:00 | NaN | NaN |
294 | Wild Thing | Wild Waves Theme Park | Federal Way | Washington | United States | North America | Steel | Sit Down | Operating | 1997.0 | 75 | 40.000000 | 1565.000000 | YES | 3.0 | 64 | 01:00:00 | NaN | NaN |
295 | Wildfire | Kolmarden | Norrkoping | Ostergotland | Sweden | Europe | Wood | Sit Down | Operating | 2016.0 | 183.8 | 71.500000 | 4150.300000 | YES | 3.0 | 160.8 | 02:00:00 | 4 | 83.0 |
296 | Winjas | Phantasialand | Bruhl | North Rhine-Westphalia | Germany | Europe | Steel | Sit Down | Operating | 2002.0 | 57.1 | 41.000000 | 1526.600000 | NO | 0.0 | 52.5 | NaN | NaN | NaN |
297 | Wodan Timbur Coaster | Europa Park | Rust | Baden Wuerttemberg | Germany | Europe | Wood | Sit Down | Operating | 2012.0 | 131.3 | 62.100000 | 3444.900000 | NO | 0.0 | NaN | 03:25:00 | 3.5 | NaN |
298 | X2 | Six Flags Magic Mountain | Valencia | California | United States | North America | Steel | Wing | Operating | 2002.0 | 175 | 76.000000 | 3610.000000 | YES | 2.0 | 215 | NaN | 4 | 88.5 |
299 | Xcelerator | Knott's Berry Farm | Buena Park | California | United States | North America | Steel | Sit Down | Operating | 2002.0 | 145.26957 | 63.217729 | 3278.562035 | NO | 0.0 | 01:02:00 | NaN | 90.0 |
data_raw.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 300 entries, 0 to 299 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Name 300 non-null object 1 Park 300 non-null object 2 City/Region 300 non-null object 3 City/State/Region 300 non-null object 4 Country/Region 300 non-null object 5 Geographic Region 300 non-null object 6 Construction 300 non-null object 7 Type 300 non-null object 8 Status 300 non-null object 9 Year/Date Opened 300 non-null float64 10 Height (feet) 299 non-null object 11 Speed (mph) 296 non-null float64 12 Length (feet) 295 non-null float64 13 Inversions (YES or NO) 300 non-null object 14 Number of Inversions 300 non-null float64 15 Drop (feet) 142 non-null object 16 Duration (min:sec) 224 non-null object 17 G Force 83 non-null object 18 Vertical Angle (degrees) 91 non-null float64 dtypes: float64(5), object(14) memory usage: 44.7+ KB
某些列(如 Drop、Duration、G Force、Vertical Angle)存在大量缺失值,需要我们进行处理
# height
data_raw.columns
Index(['Name', 'Park', 'City/Region', 'City/State/Region', 'Country/Region', 'Geographic Region', 'Construction', 'Type', 'Status', 'Year/Date Opened', 'Height (feet)', ' Speed (mph)', 'Length (feet)', 'Inversions (YES or NO)', 'Number of Inversions', 'Drop (feet)', 'Duration (min:sec)', 'G Force', 'Vertical Angle (degrees)'], dtype='object')
data_raw['Height (feet)']
0 98.4 1 151.6 2 72 3 113 4 195 ... 295 183.8 296 57.1 297 131.3 298 175 299 145.26957 Name: Height (feet), Length: 300, dtype: object
type(data_raw.loc[0,'Height (feet)'])
float
# Replace the one height entry that carries a unit suffix ('200 ft') so it parses as a number
data_raw['Height (feet)'] = data_raw['Height (feet)'].replace('200 ft','200')
# Print the index label of every row whose height is missing or not numeric.
# errors='coerce' turns unparseable entries into NaN instead of raising, so any
# other stray text in the column is reported here rather than aborting the scan.
height_numeric = pd.to_numeric(data_raw['Height (feet)'], errors='coerce')
for i in data_raw.index[height_numeric.isna()]:
    print(i)
122
data_raw.loc[122,'Height (feet)']
nan
# Cast the whole column to float, then fill the missing entries with the column mean
height_as_float = data_raw['Height (feet)'].astype('float')
data_raw['Height (feet)'] = height_as_float
data_raw['Height (feet)'] = height_as_float.fillna(height_as_float.mean())
data_raw['Height (feet)'].mean()
135.51580458174766
data_raw.loc[122,'Height (feet)']
135.51580458174763
data_raw.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 300 entries, 0 to 299 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Name 300 non-null object 1 Park 300 non-null object 2 City/Region 300 non-null object 3 City/State/Region 300 non-null object 4 Country/Region 300 non-null object 5 Geographic Region 300 non-null object 6 Construction 300 non-null object 7 Type 300 non-null object 8 Status 300 non-null object 9 Year/Date Opened 300 non-null float64 10 Height (feet) 300 non-null float64 11 Speed (mph) 296 non-null float64 12 Length (feet) 295 non-null float64 13 Inversions (YES or NO) 300 non-null object 14 Number of Inversions 300 non-null float64 15 Drop (feet) 142 non-null object 16 Duration (min:sec) 224 non-null object 17 G Force 83 non-null object 18 Vertical Angle (degrees) 91 non-null float64 dtypes: float64(6), object(13) memory usage: 44.7+ KB
# Fill missing speed and length values with their column means.
# The speed column name really does start with a space in the workbook.
for column_name in (' Speed (mph)', 'Length (feet)'):
    column_mean = data_raw[column_name].mean()
    data_raw[column_name] = data_raw[column_name].fillna(column_mean)
data_raw.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 300 entries, 0 to 299 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Name 300 non-null object 1 Park 300 non-null object 2 City/Region 300 non-null object 3 City/State/Region 300 non-null object 4 Country/Region 300 non-null object 5 Geographic Region 300 non-null object 6 Construction 300 non-null object 7 Type 300 non-null object 8 Status 300 non-null object 9 Year/Date Opened 300 non-null float64 10 Height (feet) 300 non-null float64 11 Speed (mph) 300 non-null float64 12 Length (feet) 300 non-null float64 13 Inversions (YES or NO) 300 non-null object 14 Number of Inversions 300 non-null float64 15 Drop (feet) 142 non-null object 16 Duration (min:sec) 224 non-null object 17 G Force 83 non-null object 18 Vertical Angle (degrees) 91 non-null float64 dtypes: float64(6), object(13) memory usage: 44.7+ KB
# Save the cleaned table to a new workbook, without writing the row index as a column
data_raw.to_excel('data_new.xlsx',index=False)
data_raw.columns
Index(['Name', 'Park', 'City/Region', 'City/State/Region', 'Country/Region', 'Geographic Region', 'Construction', 'Type', 'Status', 'Year/Date Opened', 'Height (feet)', ' Speed (mph)', 'Length (feet)', 'Inversions (YES or NO)', 'Number of Inversions', 'Drop (feet)', 'Duration (min:sec)', 'G Force', 'Vertical Angle (degrees)'], dtype='object')
# Keep the coaster name plus the five numeric indicators used for weighting
selected_columns = ['Name','Year/Date Opened', 'Height (feet)', ' Speed (mph)', 'Length (feet)','Number of Inversions']
data = data_raw.loc[:, selected_columns]
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 300 entries, 0 to 299 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Name 300 non-null object 1 Year/Date Opened 300 non-null float64 2 Height (feet) 300 non-null float64 3 Speed (mph) 300 non-null float64 4 Length (feet) 300 non-null float64 5 Number of Inversions 300 non-null float64 dtypes: float64(5), object(1) memory usage: 14.2+ KB
# Use the coaster name as the row label so that only numeric columns remain
# NOTE(review): names are not unique (two "Wild Thing" rows exist), so the index contains duplicates
data = data.set_index('Name')
data.describe()
Year/Date Opened | Height (feet) | Speed (mph) | Length (feet) | Number of Inversions | |
---|---|---|---|---|---|
count | 300.000000 | 300.000000 | 300.000000 | 300.000000 | 300.000000 |
mean | 2000.663333 | 135.515805 | 59.677425 | 3149.939227 | 2.223333 |
std | 13.163367 | 66.286588 | 16.140483 | 1442.268420 | 2.603083 |
min | 1924.000000 | 28.956000 | 28.000000 | 215.000000 | 0.000000 |
25% | 1996.000000 | 98.000000 | 49.700000 | 2260.500000 | 0.000000 |
50% | 2002.000000 | 116.500000 | 55.900000 | 3090.500000 | 1.000000 |
75% | 2009.000000 | 168.500000 | 68.950000 | 4000.650000 | 4.000000 |
max | 2018.000000 | 456.000000 | 149.100000 | 8133.200000 | 14.000000 |
# 100 evenly spaced probabilities from just above 0 up to 1 (inclusive)
p_array = np.linspace(start=1e-7, stop=1, num=100)
def entropy(x):
    """Return the Shannon entropy term ``-x * ln(x)`` for one probability.

    Args:
        x: A single probability (scalar).

    Returns:
        ``0.0`` when ``x`` is exactly zero (the limit of ``-x * ln(x)`` as
        ``x`` approaches 0, which also avoids evaluating ``log(0)``);
        otherwise ``-x * np.log(x)``.
    """
    if x == 0.0:
        # Return a float so both branches yield the same numeric type.
        return 0.0
    return -x * np.log(x)
# Evaluate the entropy term at every sample probability and draw the curve
y = list(map(entropy, p_array))
plt.plot(p_array, y)
[<matplotlib.lines.Line2D at 0x1ce7c5da370>]
例子:比较[0.5,0.5]和[0.2,0.8]谁的熵更大
entropy(0.5)+entropy(0.5)
0.6931471805599453
entropy(0.2)+entropy(0.8)
0.5004024235381879
前者的熵更大(0.693 > 0.500);后者的熵更小、分布更不均匀,因此后者提供的信息量更大
# Turn each column into proportions that sum to 1 (p_ij = x_ij / sum_i x_ij)
column_totals = data.sum(axis=0)
data1 = data.div(column_totals, axis='columns')
data1
Year/Date Opened | Height (feet) | Speed (mph) | Length (feet) | Number of Inversions | |
---|---|---|---|---|---|
Name | |||||
10 Inversion Roller Coaster | 0.003342 | 0.002420 | 0.002514 | 0.002951 | 0.014993 |
Abismo | 0.003342 | 0.003729 | 0.003642 | 0.001562 | 0.002999 |
Adrenaline Peak | 0.003362 | 0.001771 | 0.002514 | 0.001111 | 0.004498 |
Afterburn | 0.003331 | 0.002780 | 0.003463 | 0.003128 | 0.008996 |
Alpengeist | 0.003327 | 0.004796 | 0.003742 | 0.004051 | 0.008996 |
... | ... | ... | ... | ... | ... |
Wildfire | 0.003359 | 0.004521 | 0.003994 | 0.004392 | 0.004498 |
Winjas | 0.003336 | 0.001405 | 0.002290 | 0.001615 | 0.000000 |
Wodan Timbur Coaster | 0.003352 | 0.003230 | 0.003469 | 0.003645 | 0.000000 |
X2 | 0.003336 | 0.004305 | 0.004245 | 0.003820 | 0.002999 |
Xcelerator | 0.003336 | 0.003573 | 0.003531 | 0.003469 | 0.000000 |
300 rows × 5 columns
# Normalized entropy of each indicator: e_j = (1 / ln n) * sum_i(-p_ij * ln p_ij).
# The sum is divided (not multiplied) by ln(n), the largest entropy n samples
# can have, so e_j is at most 1 and matches the weight formula used afterwards.
data1.applymap(entropy).sum(axis=0)/np.log(len(data1))
Year/Date Opened 32.533011 Height (feet) 31.926468 Speed (mph) 32.338632 Length (feet) 31.908184 Number of Inversions 28.331596 dtype: float64
# Divergence of each indicator: d_j = 1 - e_j, where e_j is the column's
# summed entropy terms scaled by 1 / ln(n)
inv_log_n = 1/np.log(len(data1))
column_entropy_sum = data1.applymap(entropy).sum(axis=0)
w0 = 1 - inv_log_n*column_entropy_sum
w0
Year/Date Opened 0.000004 Height (feet) 0.018648 Speed (mph) 0.005979 Length (feet) 0.019210 Number of Inversions 0.129146 dtype: float64
# Rescale the divergences so the weights add up to 1
w = w0.div(w0.sum())
w
Year/Date Opened 0.000022 Height (feet) 0.107798 Speed (mph) 0.034561 Length (feet) 0.111047 Number of Inversions 0.746571 dtype: float64
# Pie chart of w, with each wedge labelled as a one-decimal percentage
w.plot.pie(ylabel='Weight',autopct="%.1f%%")
<AxesSubplot:ylabel='Weight'>
import seaborn as sn
sn.heatmap(data.corr())
<AxesSubplot:>
data.columns
Index(['Year/Date Opened', 'Height (feet)', ' Speed (mph)', 'Length (feet)', 'Number of Inversions'], dtype='object')
# Regress speed on height using the original measurements (`data`), so the axis
# values really are feet and mph; `data1` holds column proportions, not physical units.
sn.lmplot(x='Height (feet)',y=' Speed (mph)',data=data)
<seaborn.axisgrid.FacetGrid at 0x1ce7df229d0>
(此处应有一张层次图)
data.columns
Index(['Year/Date Opened', 'Height (feet)', ' Speed (mph)', 'Length (feet)', 'Number of Inversions'], dtype='object')
# AHP pairwise-comparison (judgment) matrix for the five indicators.
# It must be positive reciprocal: A[j, i] == 1 / A[i, j] and A[i, i] == 1.
# The upper triangle holds the judgments; the lower triangle is their
# reciprocal. The last row is corrected to [1/5, 1/5, 1, 1/2, 1] so that it
# mirrors the last column (5, 5, 1, 2).
A = np.array([[1,   2,   3,   4,   5],
              [1/2, 1,   3,   3,   5],
              [1/3, 1/3, 1,   2,   1],
              [1/4, 1/3, 1/2, 1,   2],
              [1/5, 1/5, 1,   1/2, 1]])
A
array([[1. , 2. , 3. , 4. , 5. ], [0.5 , 1. , 3. , 3. , 5. ], [0.33333333, 0.33333333, 1. , 2. , 1. ], [0.25 , 0.33333333, 0.5 , 1. , 2. ], [0.2 , 0.33333333, 0.5 , 1. , 1. ]])
若非零向量 $X$ 满足 $AX=\lambda X$,则称 $\lambda$ 是矩阵 $A$ 的特征值,$X$ 是 $A$ 对应于 $\lambda$ 的特征向量
# Eigen-decomposition of A: `values` holds the eigenvalues and `vectors` holds
# the matching eigenvectors, one per column.
# NOTE: np.linalg.eig does not return the eigenvalues in any guaranteed order.
values, vectors = np.linalg.eig(A)
values
array([ 5.25998197+0.j , 0.0361071 +0.549452j, 0.0361071 -0.549452j, -0.49162334+0.j , 0.15942716+0.j ])
vectors
array([[ 0.7662658 +0.j , 0.77211644+0.j , 0.77211644-0.j , -0.59495681+0.j , 0.9610544 +0.j ], [ 0.55316732+0.j , -0.00934684+0.56590966j, -0.00934684-0.56590966j, 0.72308748+0.j , -0.13455456+0.j ], [ 0.22487996+0.j , -0.23237243+0.03726749j, -0.23237243-0.03726749j, -0.22185629+0.j , 0.15716178+0.j ], [ 0.18459393+0.j , -0.03755482-0.15696645j, -0.03755482+0.15696645j, 0.22237227+0.j , -0.17130282+0.j ], [ 0.14898565+0.j , 0.02435853-0.03830301j, 0.02435853+0.03830301j, -0.15652874+0.j , -0.06500024+0.j ]])
# Pick the eigenvector that belongs to the largest (principal) eigenvalue.
# np.linalg.eig gives no ordering guarantee, so locate it explicitly instead of
# assuming it is column 0.
principal_index = np.argmax(values.real)
# The principal eigenvalue of a positive matrix is real, so its eigenvector has
# no imaginary part; taking .real only drops the complex dtype that eig uses
# when other eigenvalues are complex.
w0 = vectors[:, principal_index].real
w0
array([0.7662658 +0.j, 0.55316732+0.j, 0.22487996+0.j, 0.18459393+0.j, 0.14898565+0.j])
# Normalize the eigenvector so its components sum to 1 and can be read as weights
component_total = w0.sum()
w = w0/component_total
w
array([0.40804558+0.j, 0.29456813+0.j, 0.11975123+0.j, 0.09829845+0.j, 0.07933662+0.j])
# Pie chart of the AHP weights. np.real() hands matplotlib a real-valued array;
# passing a complex-dtype array makes it emit a ComplexWarning while it
# discards the imaginary parts.
plt.pie(np.real(w),labels=data.columns,autopct='%.1f%%')
D:\softwares\anaconda3\lib\site-packages\matplotlib\axes\_axes.py:3042: ComplexWarning: Casting complex values to real discards the imaginary part x = np.asarray(x, np.float32)
([<matplotlib.patches.Wedge at 0x1ce7dfd8c10>, <matplotlib.patches.Wedge at 0x1ce7e092250>, <matplotlib.patches.Wedge at 0x1ce7e092970>, <matplotlib.patches.Wedge at 0x1ce7e0a10d0>, <matplotlib.patches.Wedge at 0x1ce7e0a17f0>], [Text(0.3133701728220095, 1.054418861167375, 'Year/Date Opened'), Text(-1.0341948520741173, -0.37475459696099095, 'Height (feet)'), Text(0.08623146118828992, -1.0966148526722281, ' Speed (mph)'), Text(0.760596389698268, -0.7946654214057388, 'Length (feet)'), Text(1.0660092844244742, -0.2713378070244185, 'Number of Inversions')], [Text(0.17092918517564154, 0.5751375606367499, '40.8%'), Text(-0.5641062829495185, -0.20441159834235867, '29.5%'), Text(0.047035342466339954, -0.5981535560030334, '12.0%'), Text(0.41487075801723705, -0.43345386622131205, '9.8%'), Text(0.5814596096860768, -0.14800244019513736, '7.9%')])
将pdf和第三题的代码文件发送到 learningmm@163.com(下周四晚之前)